# -*- coding: utf-8 -*-

"""
Datetime: 2019/6/11
Author: Zhang Yafei
Description: 核心类
"""
import asyncio
import functools
import importlib
import time
from collections.abc import Iterable, AsyncGenerator
from types import MethodType

import aiohttp
import async_timeout

import settings


def timeit(func):
    """
    Decorator: measure and print the wall-clock time of a function call.

    :param func: the function to wrap
    :return: wrapper that prints the elapsed time and returns func's result
    """

    @functools.wraps(func)
    def inner(*args, **kwargs):
        start = time.time()
        ret = func(*args, **kwargs)
        elapsed = time.time() - start
        if elapsed < 60:
            print(f'花费时间：\t{round(elapsed, 2)}秒')
        else:
            # FIX: the original bound the quotient to the name `min`,
            # shadowing the builtin; use descriptive names instead.
            minutes, seconds = divmod(elapsed, 60)
            print(f'花费时间\t{round(minutes)}分\t{round(seconds, 2)}秒')
        return ret

    return inner


class Request(object):
    """
    Container for a single crawl request: target URL, the callback that will
    parse its response, optional payload/headers and user metadata.
    """

    def __init__(self, url: str, callback: MethodType, data: dict = None, json: str = None, meta: dict = None,
                 headers: dict = None, **kwargs):
        self.url = url
        self.callback = callback
        self.data = data
        self.json = json
        # Fall back to the project-wide headers from settings when the caller
        # supplies none; normalise a falsy settings value to None.
        self.headers = headers if headers else (settings.headers if settings.headers else None)
        self.meta = meta
        self.kwargs = kwargs


class Response(object):
    """Thin wrapper pairing a downloaded response with the Request that produced it."""

    def __init__(self, response, request):
        self._response = response
        self.request = request
        # Re-expose the attributes callbacks commonly read, straight off the
        # underlying response object.
        self.text = response.text
        self.read = response.read
        self.url = response.url
        self.status = response.status


class Scheduler(object):
    """
    Request scheduler backed by two sets:

    - new_requests: pending requests not yet handed to the crawler
    - old_requests: requests that have already been handed out

    Duplicate requests (already pending or already seen) are silently dropped.
    """

    def __init__(self):
        # Pending requests awaiting download.
        self.new_requests = set()
        # Requests already popped by the crawler.
        self.old_requests = set()

    def has_new_request(self):
        """Return True while at least one pending request remains."""
        return len(self.new_requests) > 0

    async def get_new_request(self):
        """Pop one pending request and immediately mark it as seen."""
        request = self.new_requests.pop()
        self.old_requests.add(request)
        return request

    async def add_new_request(self, request):
        """Queue a single request unless it is None or already known."""
        if request is None:
            return
        if request in self.new_requests or request in self.old_requests:
            return
        self.new_requests.add(request)

    async def add_new_requests(self, requests):
        """Queue every request from an iterable; non-iterables are ignored."""
        if isinstance(requests, Iterable):
            for request in requests:
                await self.add_new_request(request)

    def new_request_size(self):
        """Number of pending requests."""
        return len(self.new_requests)

    def old_request_size(self):
        """Number of already-seen requests."""
        return len(self.old_requests)


class Crawler(object):
    """Crawler: pulls requests from the scheduler, downloads them through one
    shared aiohttp session, dispatches responses to spider callbacks and,
    when configured, hands parsed dict items to a pipeline."""

    def __init__(self, loop=None, pool=100):
        """
        :param loop: asyncio event loop (may be None; run() assigns it later)
        :param pool: maximum number of concurrent downloads
        """
        self.scheduler = Scheduler()
        self.urls = None
        self.pool = pool
        self.loop = loop

    @staticmethod
    def _create_spider():
        """Instantiate the spider class named by settings.Spider_Name
        (a dotted path such as 'package.module.ClassName')."""
        module_path, cls_name = settings.Spider_Name.rsplit('.', maxsplit=1)
        m = importlib.import_module(module_path)
        cls = getattr(m, cls_name)
        return cls()

    async def start(self, session):
        """Consume the scheduler until empty: download each request, invoke
        its callback, then queue any returned Requests or feed dict items to
        the configured pipeline.

        :param session: shared aiohttp.ClientSession used for all downloads
        """
        # BUGFIX: pipe_setting was previously assigned only inside the
        # hasattr branch, raising UnboundLocalError further down whenever
        # settings defined no PIPELINE and a callback returned an async
        # generator.
        pipe_setting = hasattr(settings, 'PIPELINE')
        pipeline = None
        if pipe_setting:
            module_path, cls_name = settings.PIPELINE.rsplit('.', maxsplit=1)
            module = importlib.import_module(module_path)
            pipeline = getattr(module, cls_name)
        # BUGFIX: create the semaphore once for the whole crawl; the original
        # re-created it on every loop iteration, which made the concurrency
        # limit ineffective (each fresh semaphore started with `pool` slots).
        sem = asyncio.Semaphore(self.pool, loop=self.loop)
        while self.scheduler.has_new_request():
            # 1. take one pending request
            new_request = await self.scheduler.get_new_request()
            # 2. download its url
            url = new_request.url
            async with sem:
                print(f'make request to {url}')
                with async_timeout.timeout(60):
                    kwargs = new_request.kwargs
                    async with session.get(url=url, data=new_request.data, json=new_request.json,
                                           headers=new_request.headers,
                                           verify_ssl=False, **kwargs) as response:
                        response = Response(response, new_request)
                        if response.status == 200:
                            # 3. parse the body via the callback; the callback
                            # may be either a coroutine or a plain function.
                            try:
                                result = await new_request.callback(response=response)
                            except TypeError:
                                result = new_request.callback(response=response)
                            if result:
                                # 4. the callback yields either new Requests
                                # or parsed data items
                                if not isinstance(result, Iterable) and not isinstance(result, AsyncGenerator):
                                    raise Exception('返回的数据类型不可迭代')
                                if isinstance(result, Iterable):
                                    for ret in result:
                                        if isinstance(ret, Request):
                                            # 5. queue follow-up requests
                                            await self.scheduler.add_new_request(ret)
                                        else:
                                            raise Exception('返回可迭代对象只能为Request对象')
                                elif isinstance(result, AsyncGenerator):
                                    data_item = False
                                    if pipe_setting:
                                        if hasattr(settings, 'TO_FILE'):
                                            pipe = pipeline(file_path=settings.TO_FILE)
                                        else:
                                            raise Exception('PIPELINE设置的同时，TO_FILE必须指定')
                                    async for ret in result:
                                        if isinstance(ret, Request):
                                            # 5. queue follow-up requests
                                            await self.scheduler.add_new_request(ret)
                                        elif isinstance(ret, dict):
                                            # Without a pipeline there is
                                            # nowhere to put data items.
                                            if not pipe_setting:
                                                break
                                            data_item = True
                                            await pipe.add_item(ret)
                                        else:
                                            raise Exception('返回可迭代对象只能为Request对象或字典')
                                    if data_item:
                                        await pipe.item_completed()
                                        print('下载成功')
                        else:
                            print(f'{new_request.url}\t请求失败\t{response.status}')

    async def open_spider(self):
        """Create the spider, let it drop already-downloaded urls if it
        implements filter_downloaded_urls, and seed the scheduler with its
        start requests."""
        spider = self._create_spider()
        if hasattr(spider, 'filter_downloaded_urls'):
            spider.filter_downloaded_urls()
        await self.scheduler.add_new_requests(spider.start_request())

    async def run(self, loop):
        """Run the whole crawl: seed requests, open one shared HTTP session
        and drive start() to completion, printing any raised exceptions."""
        await self.open_spider()
        print('*******************开始下载***********************')
        self.loop = loop
        conn = aiohttp.TCPConnector(ssl=False, limit=100, use_dns_cache=True)
        async with aiohttp.ClientSession(connector=conn, loop=loop) as session:
            # BUGFIX: without return_exceptions=True gather re-raises instead
            # of returning exceptions, so the isinstance check below was dead
            # code and any task failure aborted the crawl uncaught.
            futures = await asyncio.gather(*[self.start(session)], return_exceptions=True)
            for future in futures:
                if isinstance(future, Exception):
                    print(future)

class Manager(object):
    """Program entry point: wires an event loop to a Crawler and runs it."""

    @staticmethod
    def run():
        """Run the crawler to completion on a fresh event loop, then close it."""
        event_loop = asyncio.get_event_loop()
        spider_crawler = Crawler()
        event_loop.run_until_complete(spider_crawler.run(event_loop))
        event_loop.close()
